def insert_data(conn, csv_file):
cursor = conn.cursor()
# Open and parse CSV file
with open(csv_file, 'r') as file:
reader = csv.DictReader(file)
for row in reader:
try:
# Insert into Country table
cursor.execute("""
INSERT OR IGNORE INTO Country (CountryName, Status)
VALUES (?, ?)
""", (row['Country'], row['Status']))
# Get CountryID for foreign key reference
cursor.execute("SELECT CountryID FROM Country WHERE CountryName = ?", (row['Country'],))
country_id = cursor.fetchone()[0]
# Insert into YearlyData table
cursor.execute("""
INSERT INTO YearlyData (
CountryID, Year, LifeExpectancy, AdultMortality, InfantDeaths, Alcohol,
PercentageExpenditure, HepatitisB, Measles, BMI, UnderFiveDeaths, Polio,
TotalExpenditure, Diphtheria, HIV_AIDS, GDP, Population, Thinness_1_19,
Thinness_5_9, IncomeComposition, Schooling
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
country_id,
int(row['Year']),
float(row['Life expectancy ']) if row['Life expectancy '] else None,
int(row['Adult Mortality']) if row['Adult Mortality'] else None,
int(row['infant deaths']) if row['infant deaths'] else None,
float(row['Alcohol']) if row['Alcohol'] else None,
float(row['percentage expenditure']) if row['percentage expenditure'] else None,
int(row['Hepatitis B']) if row['Hepatitis B'] else None,
int(row['Measles ']) if row['Measles '] else None,
float(row[' BMI ']) if row[' BMI '] else None,
int(row['under-five deaths ']) if row['under-five deaths '] else None,
int(row['Polio']) if row['Polio'] else None,
float(row['Total expenditure']) if row['Total expenditure'] else None,
int(row['Diphtheria ']) if row['Diphtheria '] else None,
float(row[' HIV/AIDS']) if row[' HIV/AIDS'] else None,
float(row['GDP']) if row['GDP'] else None,
float(row['Population']) if row['Population'] else None,
float(row[' thinness 1-19 years']) if row[' thinness 1-19 years'] else None,
float(row[' thinness 5-9 years']) if row[' thinness 5-9 years'] else None,
float(row['Income composition of resources']) if row['Income composition of resources'] else None,
float(row['Schooling']) if row['Schooling'] else None
))
# Commit after each successful insert to release the lock quickly
conn.commit()
except sqlite3.Error as e:
print(f"Error: {e}")
conn.rollback() # Rollback in case of an error
# Close cursor
cursor.close()
import sqlite3
import pandas as pd
def fetch_data_to_dataframe():
# Connect to the SQLite database
conn = sqlite3.connect("life_expectancy.db")
# SQL query without explicit casting
query = """
SELECT
c.CountryName AS Country,
c.Status AS Status,
y.Year AS Year,
y.LifeExpectancy AS LifeExpectancy,
y.AdultMortality AS AdultMortality,
y.InfantDeaths AS InfantDeaths,
y.Alcohol AS AlcoholConsumption
FROM
YearlyData y
JOIN
Country c ON y.CountryID = c.CountryID;
"""
# Execute query and load into Pandas DataFrame
df = pd.read_sql_query(query, conn)
conn.close()
# Convert columns to appropriate data types
df['Year'] = df['Year'].astype(int)
df['LifeExpectancy'] = pd.to_numeric(df['LifeExpectancy'], errors='coerce')
df['AdultMortality'] = pd.to_numeric(df['AdultMortality'], errors='coerce', downcast='integer')
df['InfantDeaths'] = pd.to_numeric(df['InfantDeaths'], errors='coerce', downcast='integer')
df['AlcoholConsumption'] = pd.to_numeric(df['AlcoholConsumption'], errors='coerce')
return df
# Fetch data and display the DataFrame
df = fetch_data_to_dataframe()
print(df.dtypes) # Verify column data types
print(df.head())
---------------------------------------------------------------------------
OperationalError Traceback (most recent call last)
File /opt/anaconda3/lib/python3.12/site-packages/pandas/io/sql.py:2674, in SQLiteDatabase.execute(self, sql, params)
2673 try:
-> 2674 cur.execute(sql, *args)
2675 return cur
OperationalError: no such table: YearlyData
The above exception was the direct cause of the following exception:
DatabaseError Traceback (most recent call last)
Cell In[2], line 39
36 return df
38 # Fetch data and display the DataFrame
---> 39 df = fetch_data_to_dataframe()
40 print(df.dtypes) # Verify column data types
41 print(df.head())
Cell In[2], line 25, in fetch_data_to_dataframe()
9 query = """
10 SELECT
11 c.CountryName AS Country,
(...)
21 Country c ON y.CountryID = c.CountryID;
22 """
24 # Execute query and load into Pandas DataFrame
---> 25 df = pd.read_sql_query(query, conn)
27 conn.close()
29 # Convert columns to appropriate data types
File /opt/anaconda3/lib/python3.12/site-packages/pandas/io/sql.py:526, in read_sql_query(sql, con, index_col, coerce_float, params, parse_dates, chunksize, dtype, dtype_backend)
523 assert dtype_backend is not lib.no_default
525 with pandasSQL_builder(con) as pandas_sql:
--> 526 return pandas_sql.read_query(
527 sql,
528 index_col=index_col,
529 params=params,
530 coerce_float=coerce_float,
531 parse_dates=parse_dates,
532 chunksize=chunksize,
533 dtype=dtype,
534 dtype_backend=dtype_backend,
535 )
File /opt/anaconda3/lib/python3.12/site-packages/pandas/io/sql.py:2738, in SQLiteDatabase.read_query(self, sql, index_col, coerce_float, parse_dates, params, chunksize, dtype, dtype_backend)
2727 def read_query(
2728 self,
2729 sql,
(...)
2736 dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
2737 ) -> DataFrame | Iterator[DataFrame]:
-> 2738 cursor = self.execute(sql, params)
2739 columns = [col_desc[0] for col_desc in cursor.description]
2741 if chunksize is not None:
File /opt/anaconda3/lib/python3.12/site-packages/pandas/io/sql.py:2686, in SQLiteDatabase.execute(self, sql, params)
2683 raise ex from inner_exc
2685 ex = DatabaseError(f"Execution failed on sql '{sql}': {exc}")
-> 2686 raise ex from exc
DatabaseError: Execution failed on sql '
SELECT
c.CountryName AS Country,
c.Status AS Status,
y.Year AS Year,
y.LifeExpectancy AS LifeExpectancy,
y.AdultMortality AS AdultMortality,
y.InfantDeaths AS InfantDeaths,
y.Alcohol AS AlcoholConsumption
FROM
YearlyData y
JOIN
Country c ON y.CountryID = c.CountryID;
': no such table: YearlyData
print(df.dtypes)
Country object
Status object
Year int32
LifeExpectancy int64
AdultMortality int8
InfantDeaths int8
AlcoholConsumption int64
dtype: object
import pandas as pd
# Load the dataset
df = pd.read_csv("Life Expectancy Data.csv")
# Check for missing or invalid values in 'Status'
if 'Status' not in df.columns:
raise ValueError("'Status' column is missing from the dataset.")
# Drop rows with missing 'Status'
df = df.dropna(subset=['Status'])
# Ensure there are at least two classes in 'Status' for stratification
if len(df['Status'].unique()) < 2:
raise ValueError("Stratification not possible: Only one class present in 'Status'.")
# Define features (X) and target (y)
X = df.drop(columns=['Life expectancy ', 'Country']) # Drop target and unrelated columns
y = df['Life expectancy '] # Target column
from sklearn.model_selection import train_test_split
# Perform train/test split with stratification on 'Status'
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=X['Status']
)
# Verify stratification
print("Train set distribution:")
print(X_train['Status'].value_counts(normalize=True))
print("\nTest set distribution:")
print(X_test['Status'].value_counts(normalize=True))
Train set distribution:
Status
Developing 0.825532
Developed 0.174468
Name: proportion, dtype: float64
Test set distribution:
Status
Developing 0.826531
Developed 0.173469
Name: proportion, dtype: float64
import pandas as pd
from ydata_profiling import ProfileReport
# Load the dataset
df = pd.read_csv("Life Expectancy Data.csv")
# Generate the profiling report
profile = ProfileReport(df, title="Life Expectancy Data Profile", explorative=True)
# Save the report to an HTML file (optional)
profile.to_file("life_expectancy_profile_report.html")
# Display the report in a Jupyter Notebook (if applicable)
profile.to_notebook_iframe()